The latest version on different platforms can be installed following instructions at http://bioconductor.org/install/#install-R.
Install packages used:
source("http://bioconductor.org/biocLite.R")
biocLite(c("supraHex","devtools","XGR","dplyr"))
devtools::install_github(c("hfang-bristol/supraHex", "hfang-bristol/XGR"))
A collection of aesthetic maps supported for the self-organised representation and comparison: a supra-hexagonal map, a ring-shaped map, a diamond map, a trefoil map, a butterfly map, an hourglass map, a ladder map, a bridge map, and a triangle-shaped map.
library(supraHex)
library(ggplot2)
colormap <- 'lightyellow-lightblue'
color <- 'black'
# For "supraHex" shape itself
sHex <- sHexGridVariant(r=6, shape="suprahex")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_suprahex <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "triangle" shape
sHex <- sHexGridVariant(r=6, shape="triangle")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_triangle <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "diamond" shape
sHex <- sHexGridVariant(r=6, shape="diamond")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_diamond <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "hourglass" shape
sHex <- sHexGridVariant(r=6, shape="hourglass")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_hourglass <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "trefoil" shape
sHex <- sHexGridVariant(r=6, shape="trefoil")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_trefoil <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ladder" shape
sHex <- sHexGridVariant(r=6, shape="ladder")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ladder <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "butterfly" shape
sHex <- sHexGridVariant(r=6, shape="butterfly")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_butterfly <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "ring" shape
sHex <- sHexGridVariant(r=6, shape="ring")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_ring <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
# For "bridge" shape
sHex <- sHexGridVariant(r=6, shape="bridge")
df_polygon <- sHexPolygon(sHex)
df_coord <- data.frame(sHex$coord, index=1:nrow(sHex$coord))
gp_bridge <- ggplot(data=df_polygon, aes(x,y,group=index)) + geom_polygon(aes(fill=factor(stepCentroid%%2))) + coord_fixed(ratio=1) + theme_void() + theme(legend.position="none") + scale_fill_manual(values=visColormap(colormap)(2)) + geom_text(data=df_coord, aes(x,y,label=index), color=color, size=2.5) + theme(plot.margin=unit(rep(0,4),rep("lines",4)))
gridExtra::grid.arrange(grobs=list(gp_suprahex,gp_ring,gp_diamond,gp_trefoil,gp_butterfly,gp_hourglass,gp_ladder,gp_bridge,gp_triangle), layout_matrix=rbind(c(1,1,2,2,3),c(1,1,2,2,3),c(4,4,5,5,6),c(4,4,5,5,6),c(7,7,8,8,9)), nrow=5, ncol=5)
In this showcase, we analyse TF and DHS datasets in K562, and compare TF datasets between K562 and Gm12878.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and DHS datasets:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to Gm12878 and K562
anno_gr_Gm12878 <- ENCODE_TFBS_ClusteredV3_CellTypes$GM12878
anno_gr_K562 <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
names_common <- sort(intersect(names(anno_gr_Gm12878), names(anno_gr_K562)))
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878 <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562 <- anno_gr_K562[ind]
# extract DHS, stored in a list of GR objects
ENCODE_DNaseI_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_DNaseI_ClusteredV3_CellTypes', RData.location=RData.location)
## in K562
ind <- grep("K562", names(ENCODE_DNaseI_ClusteredV3_CellTypes))
DHS_gr_K562 <- ENCODE_DNaseI_ClusteredV3_CellTypes[ind]
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Justify the use of the bridge-shaped map:
data <- G2TF_K562
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + geom_point(size=1,col='transparent') + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=48) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the bridge-shaped map using cross-TF gene scores:
data <- G2TF_K562
sMap <- sPipeline(data, shape="bridge", finetuneSustain=T)
Build neighbour-joining TF tree:
D <- t(sMap$codebook)
set.seed(825)
tree_bs <- visTreeBootstrap(D, num.bootstrap=1000, nodelabels.arg=list(frame="circle",cex=0.35,col="black"), plot.phylo.arg=list(type=c("phylogram","cladogram","fan","radial")[1],cex=0.6,node.pos=1))
Visualise TF map (reordered by tree):
tree_bs_ind <- match(tree_bs$tip.label, rownames(D))
xMap <- sMap
xMap$codebook <- xMap$codebook[,tree_bs_ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=51:1, rect.grid=c(7,8), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Calculate gene-centric DHS association scores:
anno_gr <- DHS_gr_K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=0, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, Cell=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2DHS <- as.matrix(xSparseMatrix(df[,c("Gene","Cell","Score")]))
The DHS map is obtained by overlaying the DHS data onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2DHS))
additional <- G2DHS[ind,]
additional[is.na(additional)] <- 0
additional <- matrix(additional, ncol=1)
rownames(additional) <- rownames(G2TF_K562)
colnames(additional) <- 'DHS'
sOverlay_DHS <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Visualise DHS map:
colormap <- xColormap("jet.top")
visHexMulComp(sOverlay_DHS, rect.grid=c(1,1), title.rotate=0, colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=1), newpage=F)
Rankings of 51 TFs in terms of similarity (Pearson correlation) to the DHS map in K562:
M_K562 <- sMap$codebook
M_DHS <- sOverlay_DHS$codebook
y <- M_DHS[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_DHS <- xPolarDot(df, parallel=T, signature=F)
gp_DHS + labs(y="Pearson correlation") + ylim(0,1)
Calculate gene-centric TF association scores:
anno_gr <- TF_gr_Gm12878
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878 <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
The TF map is obtained by overlaying the TF data in Gm12878 onto the trained TF map in K562:
ind <- match(rownames(G2TF_K562), rownames(G2TF_Gm12878))
additional <- G2TF_Gm12878[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878 <- sMapOverlay(sMap, data=G2TF_K562, additional=additional)
Rankings of 51 TFs in terms of similarity (Pearson correlation) in two cell lines (K562 and Gm12878):
M_K562 <- sMap$codebook
M_Gm12878 <- sOverlay_Gm12878$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_twin <- xPolarDot(df, parallel=T, signature=F)
gp_twin + labs(y="Pearson correlation") + ylim(0,1)
Visualise TF map (reordered by correlation):
## yMap
yMap <- sMap
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_Gm12878
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## xMap
xMap <- sMap
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*2)
matrix.combined[, seq(1,nc*2,2)] <- yMap$codebook
matrix.combined[, seq(2,nc*2,2)] <- zMap$codebook
xMap$codebook <- matrix.combined
colnames(xMap$codebook) <- rep(colnames(yMap$codebook), each=2)
## visualisation
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=1:(nc*2), rect.grid=c(13,8), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.4), newpage=F)
Perform enrichment analysis for TFs:
## TFs (high correlation)
data <- subset(df, rank<=25)$TF
eTerm_high <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## TFs (low correlation)
data <- subset(df, rank>=27)$TF
eTerm_low <- xEnricherGenes(data=data, ontology="SF", ontology.algorithm="none", RData.location=RData.location)
## visualisation
list_eTerm <- list(eTerm_low, eTerm_high)
names(list_eTerm) <- c('TFs (low correlation)', 'TFs (high correlation)')
bp <- xEnrichCompare(list_eTerm, displayBy="fc", bar.label.size=2.5, signature=F)
bp + theme(axis.text.y=element_text(size=10))
Enriched domains for TFs (high correlation):
xEnrichViewer(eTerm_high, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 46785 "Winged helix" DNA-binding domain 212 6 17.40 9.72
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.30 6.61
> 57667 beta-beta-alpha zinc fingers 737 6 5.01 4.52
> pvalue adjp distance members
> 46785 2.8e-08 8.3e-08 a.4.5 E2F4, ELF1, ELK1, ETS1, RAD21, SPI1
> 47459 3.0e-05 4.4e-05 a.38.1 BHLHE40, USF1, USF2
> 57667 1.1e-04 1.1e-04 g.37.1 CTCF, EGR1, MAZ, YY1, ZBTB33, ZNF143
Enriched domains for TFs (low correlation):
xEnrichViewer(eTerm_low, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> 57959 Leucine zipper domain 52 5 59.2 17.00
> 47459 HLH, helix-loop-helix DNA-binding domain 113 3 16.3 6.61
> 57667 beta-beta-alpha zinc fingers 737 3 2.5 1.70
> pvalue adjp distance members
> 57959 1.6e-10 4.9e-10 h.1.3 ATF3, CEBPB, FOS, JUND, NFE2
> 47459 3.0e-05 4.4e-05 a.38.1 MAX, MXI1, MYC
> 57667 2.9e-02 2.9e-02 g.37.1 REST, SP1, ZNF274
In this showcase, we analyse TF datasets in K562, Gm12878 and H1hESC.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF datasets shared by K562, Gm12878 and H1hESC:
# extract TFBSs, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
## common to H1-hESC, Gm12878 and K562
anno_gr_H1hESC <- ENCODE_TFBS_ClusteredV3_CellTypes$'H1-hESC'
names_common <- sort(base::Reduce(intersect, list(names(anno_gr_Gm12878), names(anno_gr_K562), names(anno_gr_H1hESC))))
ind <- match(names_common, names(anno_gr_K562))
TF_gr_K562_tri <- anno_gr_K562[ind]
ind <- match(names_common, names(anno_gr_Gm12878))
TF_gr_Gm12878_tri <- anno_gr_Gm12878[ind]
ind <- match(names_common, names(anno_gr_H1hESC))
TF_gr_H1hESC_tri <- anno_gr_H1hESC[ind]
Calculate gene-centric TF association scores:
## G2TF_K562_tri
anno_gr <- TF_gr_K562_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_K562_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_Gm12878_tri
anno_gr <- TF_gr_Gm12878_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_Gm12878_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
## G2TF_H1hESC_tri
anno_gr <- TF_gr_H1hESC_tri
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
G2TF_H1hESC_tri <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
Train the bridge-shaped map using cross-TF gene scores in K562:
data <- G2TF_K562_tri
sMap_tri <- sPipeline(data, shape="bridge", finetuneSustain=T)
TF map in Gm12878 (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_Gm12878_tri))
additional <- G2TF_Gm12878_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_Gm12878_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
TF map in H1hESC (overlaid onto the trained TF map in K562):
ind <- match(rownames(G2TF_K562_tri), rownames(G2TF_H1hESC_tri))
additional <- G2TF_H1hESC_tri[ind,]
additional[is.na(additional)] <- 0
sOverlay_H1hESC_tri <- sMapOverlay(sMap_tri, data=G2TF_K562_tri, additional=additional)
Correlation analysis of TF’s similarity between Gm12878 and H1hESC in terms of K562:
M_K562 <- sMap_tri$codebook
M_Gm12878 <- sOverlay_Gm12878_tri$codebook
M_H1hESC <- sOverlay_H1hESC_tri$codebook
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
## Gm12878
y <- M_Gm12878[,i]
names(y) <- 1:nrow(M_Gm12878)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary1 <- ls_res$df_summary
## H1hESC
y <- M_H1hESC[,i]
names(y) <- 1:nrow(M_H1hESC)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary2 <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], Gm12878=df_summary1$cor, H1hESC=df_summary2$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
## correlation plot
x <- df[,c('TF','Gm12878')]
y <- df$H1hESC
names(y) <- df$TF
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal", plot=T)
gp_tri <- ls_res$ls_gp[[1]]
gp_tri <- gp_tri + labs(x="Correlation between Gm12878 and K562", y="Correlation between H1hESC and K562") + xlim(0.5,1) + ylim(0.5,1)
gp_tri <- gp_tri + geom_abline(intercept=0, slope=1, color="gray50") + coord_fixed(ratio=1)
gp_tri <- gp_tri + ggrepel::geom_text_repel(data=gp_tri$data, aes(label=name), size=2, fontface='bold', box.padding=unit(0.2,"lines"), point.padding=unit(0.2,"lines"), segment.color='grey50', segment.alpha=0.5, arrow=arrow(length=unit(0.01,'npc')), force=0.1)
gp_tri
Visualise TF map (reordered by correlation difference):
## order by the difference
df$diff <- df$H1hESC - df$Gm12878
df <- df %>% dplyr::arrange(diff)
## xMap
xMap <- sMap_tri
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
## yMap
yMap <- sOverlay_Gm12878_tri
ind <- match(df$TF, colnames(yMap$codebook))
yMap$codebook <- yMap$codebook[,ind]
## zMap
zMap <- sOverlay_H1hESC_tri
ind <- match(df$TF, colnames(zMap$codebook))
zMap$codebook <- zMap$codebook[,ind]
## aMap
aMap <- sMap_tri
nr <- nrow(xMap$codebook)
nc <- ncol(xMap$codebook)
matrix.combined <- matrix(NA, nrow=nr, ncol=nc*3)
### ordered by H1hESC Gm12878 K562
matrix.combined[, seq(1,nc*3,3)] <- zMap$codebook
matrix.combined[, seq(2,nc*3,3)] <- yMap$codebook
matrix.combined[, seq(3,nc*3,3)] <- xMap$codebook
aMap$codebook <- matrix.combined
colnames(aMap$codebook) <- rep(colnames((xMap$codebook)), each=3)
## visualisation
colormap <- xColormap("jet.top")
### reverse order by K562 Gm12878 H1hESC
visHexMulComp(aMap, which.components=(nc*3):1, rect.grid=c(9,9), title.rotate=0, title.xy=c(0.3,1), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.7), newpage=F)
In this showcase, we analyse TF datasets in K562 and expression datasets in chronic myeloid leukemia (CML, subdivided into blast crisis (BC), chronic phase (CP) and accelerated phase (AP)). Note, the K562 cell line was derived from a CML patient in blast crisis.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF datasets in K562 and expression datasets in CML:
# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
# import CML expression
data.file <- file.path(RData.location,'GSE4170.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, gene!='')
## take average per gene
data <- as.data.frame(data %>% dplyr::group_by(gene) %>% dplyr::summarise(CP=mean(CP), AP=mean(AP), BC=mean(BC)))
# extract genes commons to CRISPR and TF datasets in K562
ind <- match(data$gene, rownames(mat))
G2TF_K562_expression <- mat[ind[!is.na(ind)],]
G2EXP_CML <- data.frame(data[!is.na(ind), c('CP','AP','BC')], stringsAsFactors=F)
rownames(G2EXP_CML) <- data$gene[!is.na(ind)]
Justify the use of the butterfly-shaped map:
Note: both supra-hexagonal and butterfly shapes are valid, and we choose the butterfly one as there exist two major clusters of data points.
data <- G2TF_K562_expression
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + geom_point(size=1,col='transparent') + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=30) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the diamond-shaped map using cross-TF gene scores:
data <- G2TF_K562_expression
sMe <- sPipeline(data, shape="butterfly", xdim=12, finetuneSustain=T)
The expression map is obtained by overlaying the CML expression data onto the trained TF map in K562:
sOverlay_EXP <- sMapOverlay(sMe, data=G2TF_K562_expression, additional=G2EXP_CML)
Visualise expression map:
colormap <- xColormap("jet.both")
visHexMulComp(sOverlay_EXP, rect.grid=c(1,3), title.rotate=0, title.xy=c(0.45,1), colormap=colormap, zlim=c(-0.3,0.3), gp=grid::gpar(cex=1), newpage=F)
Rankings of 100 TFs in terms of similarity (Pearson correlation) to the expression map in CML blast crisis phase (BC):
M_K562 <- sMe$codebook
M_EXP <- sOverlay_EXP$codebook
y <- M_EXP[,'BC']
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_EXP <- xPolarDot(df, colormap='blue-yellow-red', size=1.5, parallel=T, signature=F)
gp_EXP + labs(y="Pearson correlation") + scale_y_continuous(limits=c(-1,1))
Visualise TF map (reordered by correlation):
xMap <- sMe
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=100:1, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Perform enrichment analysis for TFs:
## TFs (high correlation)
data <- subset(df, rank<=20)$TF
background <- NULL
eT_high <- xEnricherGenes(data=data, background=background, ontology="REACTOME", size.range=c(10,2000), test="fisher", min.overlap=5, ontology.algorithm="none", RData.location=RData.location)
## TFs (low correlation)
data <- subset(df, rank>=81)$TF
eT_low <- xEnricherGenes(data=data, background=background, ontology="REACTOME", size.range=c(10,2000), test="fisher", min.overlap=5, ontology.algorithm="none", RData.location=RData.location)
## visualisation
list_eTerm <- list(eT_low, eT_high)
names(list_eTerm) <- c('TFs (low correlation)', 'TFs (high correlation)')
bp <- xEnrichCompare(list_eTerm, displayBy="fc", bar.label.size=2.5, signature=F)
bp + theme(axis.text.y=element_text(size=10))
Enriched domains for TFs (high correlation):
xEnrichViewer(eT_high, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> R-HSA-212436 Generic Transcription Pathway 1063 10 5.84 6.71
> R-HSA-74160 Gene expression (Transcription) 1307 11 5.23 6.58
> R-HSA-73857 RNA Polymerase II Transcription 1185 10 5.24 6.25
> R-HSA-3700989 Transcriptional Regulation by TP53 361 6 10.30 7.25
> R-HSA-4839726 Chromatin organization 224 5 13.90 7.82
> R-HSA-3247509 Chromatin modifying enzymes 224 5 13.90 7.82
> pvalue adjp distance
> R-HSA-212436 8.3e-07 2.5e-06 3
> R-HSA-74160 4.5e-07 2.5e-06 1
> R-HSA-73857 2.3e-06 4.6e-06 2
> R-HSA-3700989 1.3e-05 1.9e-05 4
> R-HSA-4839726 2.0e-05 2.0e-05 1
> R-HSA-3247509 2.0e-05 2.0e-05 2
> members
> R-HSA-212436 E2F4, ELF1, HDAC1, KDM5B, PML, POLR2A, RBBP5, TAF1, TBP, YY1
> R-HSA-74160 E2F4, ELF1, HDAC1, KDM5B, PML, POLR2A, RBBP5, TAF1, TBP, UBTF, YY1
> R-HSA-73857 E2F4, ELF1, HDAC1, KDM5B, PML, POLR2A, RBBP5, TAF1, TBP, YY1
> R-HSA-3700989 E2F4, HDAC1, PML, POLR2A, TAF1, TBP
> R-HSA-4839726 HDAC1, KDM5B, PHF8, RBBP5, SAP30
> R-HSA-3247509 HDAC1, KDM5B, PHF8, RBBP5, SAP30
Enriched domains for TFs (low correlation):
xEnrichViewer(eT_low, top=NULL, details=T)
> name nAnno nOverlap fc zscore
> R-HSA-74160 Gene expression (Transcription) 1307 9 4.28 5.10
> R-HSA-212436 Generic Transcription Pathway 1063 6 3.51 3.47
> R-HSA-73857 RNA Polymerase II Transcription 1185 6 3.14 3.16
> R-HSA-1280215 Cytokine Signaling in Immune system 898 5 3.46 3.10
> pvalue adjp distance
> R-HSA-74160 5.5e-05 0.00022 1
> R-HSA-212436 4.6e-03 0.00920 3
> R-HSA-73857 7.9e-03 0.01100 2
> R-HSA-1280215 1.1e-02 0.01100 2
> members
> R-HSA-74160 BRF1, EZH2, GATA1, JUNB, POLR3G, SMARCA4, SMARCB1, ZNF263, ZNF274
> R-HSA-212436 GATA1, JUNB, SMARCA4, SMARCB1, ZNF263, ZNF274
> R-HSA-73857 GATA1, JUNB, SMARCA4, SMARCB1, ZNF263, ZNF274
> R-HSA-1280215 EZH2, JUNB, SMARCA4, STAT1, STAT2
In this showcase, we analyse TF and CRISPR datasets in K562.
First of all, load the packages:
library(supraHex)
library(XGR)
# optionally, specify the local location of built-in RData
# by default:
RData.location <- "http://galahad.well.ox.ac.uk/bigdata_dev"
Then, import TF and CRISPR datasets in K562:
# extract TF, stored in a list of GR objects
ENCODE_TFBS_ClusteredV3_CellTypes <- xRDataLoader(RData.customised='ENCODE_TFBS_ClusteredV3_CellTypes', RData.location=RData.location)
# calculate gene-centric TF association scores
anno_gr <- ENCODE_TFBS_ClusteredV3_CellTypes$K562
res_ls <- lapply(1:length(anno_gr), function(i){
message(sprintf("Analysing %d '%s' (%s)", i, names(anno_gr)[i], as.character(Sys.time())), appendLF=T)
df_nGenes <- xGR2nGenes(data=anno_gr[[i]], format="GRanges", distance.max=50000, decay.kernel="slow", decay.exponent=2, scoring=T, scoring.scheme="max", RData.location=RData.location, verbose=F)
data.frame(df_nGenes, TF=rep(names(anno_gr)[i],nrow(df_nGenes)), stringsAsFactors=F)
})
df <- do.call(rbind, res_ls)
mat <- as.matrix(xSparseMatrix(df[,c("Gene","TF","Score")]))
# import CRISPR
data.file <- file.path(RData.location,'TW_Science_CRISPR.txt')
data <- read.delim(data.file, header=TRUE, stringsAsFactors=FALSE)
data <- subset(data, K562.adjusted.p.value<0.05)
# extract genes commons to CRISPR and TF datasets in K562
ind <- match(data$Gene, rownames(mat))
G2TF_K562_crispr <- mat[ind[!is.na(ind)],]
G2CRISPR_K562 <- data.frame(CRISPR=data$K562.CS[!is.na(ind)], stringsAsFactors=F)
rownames(G2CRISPR_K562) <- data$Gene[!is.na(ind)]
Justify the use of the triangle-shaped map:
data <- G2TF_K562_crispr
df <- as.data.frame(stats::prcomp(data)$x[,1:2])
gp <- ggplot(df, aes(x=PC1, y=PC2))
gp <- gp + geom_point(size=1,col='transparent') + theme_bw() + theme(panel.grid.major=element_blank(), panel.grid.minor=element_blank())
## 2d shape plot
myColor <- xColormap(colormap="spectral")(11)
gp <- gp + stat_binhex(bins=24) + scale_fill_gradientn(colours=myColor,trans="sqrt") + geom_density2d(colour="grey")
print(gp)
Train the triangle-shaped map using cross-TF gene scores:
data <- G2TF_K562_crispr
sM <- sPipeline(data, shape="triangle", xdim=12, finetuneSustain=T)
The CRISPR map is obtained by overlaying the CRISPR data onto the trained TF map in K562:
sOverlay_CRISPR <- sMapOverlay(sM, data=G2TF_K562_crispr, additional=G2CRISPR_K562)
Visualise CRISPR map:
colormap <- xColormap("jet.both")
visHexMulComp(sOverlay_CRISPR, rect.grid=c(1,1), title.rotate=0, title.xy=c(0.4,1), colormap=colormap, zlim=c(-4,4), gp=grid::gpar(cex=1), newpage=F)
Rankings of 100 TFs in terms of similarity (Pearson correlation) to the CRISPR map in K562:
M_K562 <- sM$codebook
M_CRISPR <- sOverlay_CRISPR$codebook
y <- M_CRISPR[,1]
names(y) <- 1:length(y)
ls_df <- lapply(1:ncol(M_K562), function(i){
x <- data.frame(name=1:nrow(M_K562), M_K562[,i], stringsAsFactors=F)
ls_res <- xCorrelation(x, y, method="pearson", p.type="nominal")
df_summary <- ls_res$df_summary
data.frame(TF=colnames(M_K562)[i], cor=df_summary$cor, stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
df$rank <- rank(-1*df$cor,ties.method="first")
df <- df %>% dplyr::arrange(rank)
## polar plot
gp_CRISPR <- xPolarDot(df, size=1.5, parallel=T, signature=F)
gp_CRISPR + labs(y="Pearson correlation") + scale_y_reverse(limits=c(0,-1))
Visualise TF map (reordered by correlation):
xMap <- sM
ind <- match(df$TF, colnames(xMap$codebook))
xMap$codebook <- xMap$codebook[,ind]
colormap <- xColormap("jet.top")
visHexMulComp(xMap, which.components=100:1, rect.grid=c(10,10), title.rotate=0, title.xy=c(0.3,1.05), colormap=colormap, zlim=c(0,1), gp=grid::gpar(cex=0.6), newpage=F)
Obtain gene clusters based on TF map:
xM <- sM
ind <- match(df$TF, colnames(xM$codebook))
xM$codebook <- xM$codebook[,ind]
seed <- sDmatMinima(xM)
sBase <- sDmatCluster(xM, which_neigh=1)
colormap <- xColormap("spectral")
visDmatCluster(xM, sBase, colormap=colormap, newpage=F)
Calculate CRISPR scores per gene cluster:
colormap <- xColormap("jet.top")
output <- visDmatHeatmap(xM, data=G2TF_K562_crispr[,ind], sBase, base.color=xColormap("spectral"), base.separated.arg=list(col="white",lty=1,lwd=1), base.legend.location="bottomleft", colormap=colormap, zlim=c(0,1), KeyValueName="TF-gene association", labRow=NA, srtCol=90, cexCol=0.6, keep.data=T)
## Polar barplot
ind <- match(output$ID, rownames(G2CRISPR_K562))
output$val <- G2CRISPR_K562[ind,]
ls_tmp <- split(x=output$val, f=output$Cluster_base)
ls_df <- lapply(1:length(ls_tmp), function(i){
x <- ls_tmp[[i]]
y <- median(x)
data.frame(Meta=paste0('C',names(ls_tmp)[i]), Val=y, Num=length(x), stringsAsFactors=F)
})
df <- do.call(rbind, ls_df)
gp <- xPolarBar(df, colormap='spectral', size.name=10, size.value=2.5, parallel=F, signature=F)
gp
Perform enrichment analysis for clustered genes:
ls_cluster <- split(x=output$ID, f=output$Cluster_base)
ontologies <- c("SF","GOMF","REACTOME")
background <- output$ID
FDR.cutoff <- 0.01
ls_mat <- lapply(1:length(ontologies), function(k){
ontology <- ontologies[k]
message(sprintf("Analysing '%s' (%s) ...", ontology, as.character(Sys.time())), appendLF=T)
ls_df <- lapply(1:length(ls_cluster), function(i){
message(sprintf("\t'%s' %d (%s) ...", ontology, i, as.character(Sys.time())), appendLF=T)
eTerm <- xEnricherGenes(data=ls_cluster[[i]], ontology=ontology, background=background, size.range=c(10,2000), test="fisher", min.overlap=10, p.tail="one-tail", RData.location=RData.location, verbose=F, silent=T)
df <- xEnrichViewer(eTerm, top_num="all", sortBy="none", details=T)
if(is.null(df)){
return(NULL)
}else{
df$namespace <- rep(ontology, nrow(df))
cbind(cluster=rep(paste0('C',names(ls_cluster)[i]),nrow(df)), id=rownames(df), df, stringsAsFactors=F)
}
})
df_all <- do.call(rbind, ls_df)
ind <- which(df_all$adjp < FDR.cutoff)
if(length(ind)>=2){
df <- as.data.frame(df_all %>% dplyr::filter(adjp < FDR.cutoff))
}else{
return(NULL)
}
mat_fdr <- as.matrix(xSparseMatrix(df[,c('name','cluster','adjp')], rows=unique(df$name), columns=paste0('C',1:length(ls_cluster))))
mat_fdr[mat_fdr==0] <- NA
mat_fdr <- -log10(mat_fdr)
mat_fdr <- mat_fdr[order(-nchar(rownames(mat_fdr))), ]
as.data.frame(mat_fdr)
})
mat <- do.call(rbind, ls_mat)
gp <- xHeatmap(mat, reorder="none", colormap='skyblue-darkorange', ncolors=64, zlim=c(2,8), legend.title="Log10(FDR)\n", barwidth=0.4, x.rotate=60, shape=19, size=2, x.text.size=6,y.text.size=6, na.color='transparent',barheight=5)
gp + theme(legend.title=element_text(size=8))